{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math\n",
    "import random"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.数据集载入\n",
    "读取数据集。可通过取反筛选（见下方被注释的代码行）只保留其中两类数据；当前该筛选未启用，使用全部三类数据。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal length</th>\n",
       "      <th>sepal width</th>\n",
       "      <th>petal length</th>\n",
       "      <th>petal width</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal length  sepal width  petal length  petal width Species\n",
       "0           5.1          3.5           1.4          0.2  Setosa\n",
       "1           4.9          3.0           1.4          0.2  Setosa\n",
       "2           4.7          3.2           1.3          0.2  Setosa\n",
       "3           4.6          3.1           1.5          0.2  Setosa\n",
       "4           5.0          3.6           1.4          0.2  Setosa"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the samples from IrisData.csv into a DataFrame.\n",
    "data_df = pd.read_csv('IrisData.csv')\n",
    "# Disabled filter: when uncommented it drops the rows whose Species is \"Setosa\".\n",
    "#data_df = data_df[~data_df[\"Species\"].isin([\"Setosa\"])]\n",
    "data_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.划分训练集与测试集\n",
    "参数ratio为训练集的占比,返回值为列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def splitData(data_list,ratio):\n",
    "    train_size = int(len(data_list)*ratio)\n",
    "    random.shuffle(data_list)\n",
    "    train_set = data_list[:train_size]\n",
    "    test_set = data_list[train_size:]\n",
    "    return train_set,test_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Split 150 samples into 105 train and 45 test samples \n"
     ]
    }
   ],
   "source": [
    "# Convert the DataFrame rows to plain Python lists, then split them 70/30.\n",
    "# NOTE: the shuffle uses the global random state, so the split differs\n",
    "# between runs unless random.seed() is called beforehand.\n",
    "data_list = np.array(data_df).tolist()\n",
    "trainset,testset = splitData(data_list,ratio = 0.7)\n",
    "print('Split {0} samples into {1} train and {2} test samples '.format(len(data_df), len(trainset), len(testset)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.提取数据特征\n",
    "所收集的训练数据的特征，包含相对于每个类的每个属性的均值和方差。举例来说，若有2个类和4个数值属性，则需要每一个属性（4）和类（2）的组合的均值和方差，也就是8组属性特征；保留全部3个类时则为12组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.1 按类别划分数据，返回值为划分好的数据，以及划分好的数据集中每个类别的样本数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def seprateByClass(dataset):\n",
    "    seprate_dict = {}\n",
    "    info_dict = {}\n",
    "    for vector in dataset:\n",
    "        if vector[-1] not in seprate_dict:\n",
    "            seprate_dict[vector[-1]] = []\n",
    "            info_dict[vector[-1]] = 0\n",
    "        seprate_dict[vector[-1]].append(vector)\n",
    "        info_dict[vector[-1]] +=1\n",
    "    return seprate_dict,info_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Setosa': 41, 'Versicolour': 33, 'Virginica': 31}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group the training samples by label; train_info holds the per-class sample counts.\n",
    "train_separated,train_info = seprateByClass(trainset)\n",
    "train_info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.2 计算均值和方差\n",
    "$ var = \\frac{\\sum(x-avg)^{2}}{n-1}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mean(list):\n",
    "    list = [float(x) for x in list] #字符串转数字\n",
    "    return sum(list)/float(len(list))\n",
    "def var(list):\n",
    "    \"\"\"Return the sample variance of the values (divisor n - 1).\n",
    "\n",
    "    Each value is converted with float() first; the centre comes from mean().\n",
    "    \"\"\"\n",
    "    numbers = [float(item) for item in list]\n",
    "    centre = mean(numbers)\n",
    "    squared_deviations = [math.pow((value-centre),2) for value in numbers]\n",
    "    return sum(squared_deviations)/float(len(numbers)-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.3 计算每个属性的均值和方差\n",
    "zip函数将数据样本按照属性分组为一个个列表，然后可以对每个属性计算均值和方差。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarizeAttribute(dataset):\n",
    "    \"\"\"Return a (mean, variance) pair for every attribute column.\n",
    "\n",
    "    The last column holds the label and is removed before summarizing.\n",
    "    \"\"\"\n",
    "    features = np.delete(dataset,-1,axis = 1)  # drop the label column\n",
    "    summaries = []\n",
    "    for column in zip(*features):  # zip(*rows) yields one tuple per attribute\n",
    "        summaries.append((mean(column),var(column)))\n",
    "    return summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5.758095238095239, 0.7345732600732595),\n",
       " (3.065714285714285, 0.18592857142857133),\n",
       " (3.5533333333333323, 3.2627051282051274),\n",
       " (1.1142857142857148, 0.6014285714285714)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-attribute (mean, variance) over the whole training set, all classes pooled.\n",
    "summary = summarizeAttribute(trainset)\n",
    "summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.4 按类别提取属性特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarizeByClass(dataset):\n",
    "    \"\"\"Return per-class attribute summaries keyed by class label.\n",
    "\n",
    "    Each value is the list of (mean, variance) pairs from summarizeAttribute().\n",
    "    \"\"\"\n",
    "    samples_by_label,_ = seprateByClass(dataset)  # the counts are not needed here\n",
    "    return {\n",
    "        label: summarizeAttribute(samples)\n",
    "        for label, samples in samples_by_label.items()\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Setosa': [(4.982926829268291, 0.12445121951219511),\n",
       "  (3.3975609756097565, 0.1417439024390244),\n",
       "  (1.4707317073170731, 0.03412195121951221),\n",
       "  (0.24390243902439032, 0.012024390243902434)],\n",
       " 'Versicolour': [(5.933333333333334, 0.2766666666666667),\n",
       "  (2.7909090909090906, 0.08960227272727272),\n",
       "  (4.254545454545454, 0.23755681818181815),\n",
       "  (1.33030303030303, 0.03905303030303031)],\n",
       " 'Virginica': [(6.596774193548387, 0.5036559139784946),\n",
       "  (2.9193548387096775, 0.10427956989247314),\n",
       "  (5.5612903225806445, 0.37711827956989247),\n",
       "  (2.0354838709677416, 0.06369892473118278)]}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-class (mean, variance) summaries of the training set.\n",
    "train_Summary_by_class = summarizeByClass(trainset)\n",
    "train_Summary_by_class"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. 贝叶斯分类器，计算概率并选择具有最大概率的类作为预测结果"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 算法流程\n",
    "由于此处的数值为连续值，所以在计算类条件概率的时候要用概率密度函数\n",
    "1. 计算样本属于某类的先验概率p[yi]， //属于A类的概率，B类的概率....\n",
    "2. 通过概率密度函数计算 p(xi|c) ，首先计算在一个属性的前提下，该样本属于某类的概率；相乘合并所有属性的概率，即为某个数据样本属于某类的类条件概率 // 若为离散值，则通过类别数量比值计算p(xi|c)\n",
    "3. 计算1和2的乘积，结果的最大值就是该样本所属的类别"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.1 先验概率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calulateClassPriorProb(dataset,dataset_info):\n",
    "    dataset_prior_prob = {}\n",
    "    sample_sum = len(dataset)\n",
    "    for class_value, sample_nums in dataset_info.items():\n",
    "        dataset_prior_prob[class_value] = sample_nums/float(sample_sum)\n",
    "    return dataset_prior_prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Setosa': 0.3904761904761905,\n",
       " 'Versicolour': 0.3142857142857143,\n",
       " 'Virginica': 0.29523809523809524}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class priors: each class's share of the training samples.\n",
    "prior_prob = calulateClassPriorProb(trainset,train_info)\n",
    "prior_prob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.2 类条件概率\n",
    "计算类条件概率 $\\prod_{i=1}^d p(xi|c)$,计算各特征的各条件概率的乘积，如下所示：\n",
    " - A类的类条件概率：p(特征1|A)*p(特征2|A)*p(特征3|A)*p(特征4|A).....\n",
    " - B类的类条件概率：p(特征1|B)*p(特征2|B)*p(特征3|B)*p(特征4|B).....\n",
    " - C类的类条件概率：p(特征1|C)*p(特征2|C)*p(特征3|C)*p(特征4|C).....\n",
    " \n",
    "$ p(xi|c) = \\frac{1}{\\sqrt{2\\pi}\\sigma_{c,i}}exp(-\\frac{(xi-mean_{c,i})^{2}}{2\\sigma_{c,i}^{2}})$ , $\\sigma$是标准差（方差开方）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculateProb(x,mean,var):\n",
    "    exponent = math.exp(math.pow((x-mean),2)/(-2*var))\n",
    "    p = (1/math.sqrt(2*math.pi*var))*exponent\n",
    "    return p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculateClassProb(input_data,train_Summary_by_class):\n",
    "    \"\"\"Return the class-conditional likelihood of one sample for every class.\n",
    "\n",
    "    For each class the Gaussian densities of all attributes are multiplied\n",
    "    together (the naive Bayes independence assumption).\n",
    "\n",
    "    Args:\n",
    "        input_data: Attribute values of one sample, in the same order as the\n",
    "            summaries.\n",
    "        train_Summary_by_class: Dict mapping class label to a list of\n",
    "            (mean, variance) pairs, one per attribute.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping class label to the product of the per-attribute densities.\n",
    "    \"\"\"\n",
    "    prob = {}\n",
    "    for class_value, summary in train_Summary_by_class.items():\n",
    "        likelihood = 1\n",
    "        for i, (attr_mean, attr_var) in enumerate(summary):\n",
    "            # Accumulate inside the loop so every attribute contributes,\n",
    "            # not only the last one.\n",
    "            likelihood *= calculateProb(input_data[i],attr_mean,attr_var)\n",
    "        prob[class_value] = likelihood\n",
    "    return prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Setosa': 3.3579279836005993,\n",
       " 'Versicolour': 1.5896628317396685e-07,\n",
       " 'Virginica': 5.176617264913899e-12}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take one test sample, strip its label, and evaluate its likelihood under each class.\n",
    "input_vector = testset[1]\n",
    "input_data = input_vector[:-1]\n",
    "train_Summary_by_class = summarizeByClass(trainset)\n",
    "class_prob = calculateClassProb(input_data,train_Summary_by_class)\n",
    "class_prob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.3 先验概率*类条件概率\n",
    "由于P（x）在一个样本中是相等的，所以贝叶斯分类器只需要比较分子部分：先验概率*类条件概率，最终属于哪类的概率最大，则判别为哪类，此处为最小错误率贝叶斯分类，若采用最小风险需要加上判断为每个类别的风险损失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bayesianPredictOneSample(input_data,prior_prob=None,train_Summary_by_class=None):\n",
    "    \"\"\"Predict the class of one sample with the naive Bayes decision rule.\n",
    "\n",
    "    Args:\n",
    "        input_data: Attribute values of the sample (without the label).\n",
    "        prior_prob: Optional dict of class priors. When None it is computed\n",
    "            from the notebook-level ``trainset`` and ``train_info``.\n",
    "        train_Summary_by_class: Optional dict of per-class (mean, variance)\n",
    "            summaries. When None it is computed from ``trainset``.\n",
    "\n",
    "    Returns:\n",
    "        The class label whose prior * likelihood is largest.\n",
    "    \"\"\"\n",
    "    # Fall back to the notebook globals so one-argument calls keep working;\n",
    "    # passing precomputed values avoids refitting for every prediction.\n",
    "    if prior_prob is None:\n",
    "        prior_prob = calulateClassPriorProb(trainset,train_info)\n",
    "    if train_Summary_by_class is None:\n",
    "        train_Summary_by_class = summarizeByClass(trainset)\n",
    "    classprob_dict = calculateClassProb(input_data,train_Summary_by_class)\n",
    "    result = {\n",
    "        class_value: class_prob*prior_prob[class_value]\n",
    "        for class_value, class_prob in classprob_dict.items()\n",
    "    }\n",
    "    return max(result,key=result.get)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.利用分类器进行预测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5.1 单个样本预测类别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the sameple is predicted to class: Versicolour.\n"
     ]
    }
   ],
   "source": [
    "# Predict the class of test sample testset[1] from its attribute values.\n",
    "input_vector = testset[1]\n",
    "input_data = input_vector[:-1]\n",
    "result = bayesianPredictOneSample(input_data)\n",
    "print(\"the sameple is predicted to class: {0}.\".format(result))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5.2 对测试集进行判别，并计算分类准确率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculateAccByBeyesian(dataset):\n",
    "    \"\"\"Return the fraction of samples whose predicted label equals the true label.\n",
    "\n",
    "    The true label is the last element of each sample; the remaining elements\n",
    "    are passed to bayesianPredictOneSample().\n",
    "    \"\"\"\n",
    "    labelled = ((sample[:-1], sample[-1]) for sample in dataset)\n",
    "    hits = sum(\n",
    "        1\n",
    "        for features, expected in labelled\n",
    "        if bayesianPredictOneSample(features) == expected\n",
    "    )\n",
    "    return hits/len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9333333333333333"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of test samples whose predicted label matches the true label.\n",
    "acc = calculateAccByBeyesian(testset)\n",
    "acc"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
